This article provides the steps to create population estimates for age and sex combinations from 1995 to 2000 at the SA2 level (Australia; 2021 boundaries) and from 1995 to 2023 at the suburb level (Queensland, Australia; 2016 and 2021 boundaries).
| Data Property | Specifications |
|---|---|
| Age | 5-year groups |
| Sex | Male, female, persons |
| Spatial scope | Australia |
| Spatial resolution | SA2 |
| Temporal scope | 2001 - 2023 |
| Temporal resolution | Yearly |
| Reference | Australian Bureau of Statistics. (2023). Regional population by age and sex. ABS. https://www.abs.gov.au/statistics/people/population/regional-population-age-and-sex/2023. |
# Load data
# Tables 1-3 of the workbook share one layout (males, females, persons), so
# the read / rename / drop-columns steps are done once in a helper instead of
# being repeated three times.
load_erp_sheet <- function(sheet_name) {
  # Read one sheet of the ABS SA2-level ERP workbook (data starts at row 7).
  dt <- as.data.table(read_excel("ABS data/erp_sa2_2001_2023_asgs2021.xlsx",
                                 sheet = sheet_name, skip = 6))
  # Rename SA2 code and SA2 name to SA2_CODE and SA2_NAME for consistency
  # with other boundary datasets
  setnames(dt, old = c("SA2 code", "SA2 name"), new = c("SA2_CODE", "SA2_NAME"))
  # Remove higher-level geography columns (and SA2_CODE) that are not needed.
  # Checked that no SA2_NAMEs are getting duplicated
  # anyDuplicated(dt[1:2288,]$SA2_NAME)
  dt[, c("S/T code", "S/T name", "GCCSA code", "GCCSA name", "SA4 code",
         "SA4 name", "SA3 code", "SA3 name", "SA2_CODE") := NULL]
  dt
}
erp_sa2_2001_2023_asgs2021_m <- load_erp_sheet("Table 1")
erp_sa2_2001_2023_asgs2021_f <- load_erp_sheet("Table 2")
erp_sa2_2001_2023_asgs2021_p <- load_erp_sheet("Table 3")
rm(load_erp_sheet)
# Aggregate desired age groups
# Age groups in the data: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49,
# 50-54, 55-59, 60-64, 65-69, 70-74, 75-79, 80-84, 85 and over
# Desired age groups for CVD research: 00-24, 25-44, 45-64, 65-74, 75-84, 85 and over
# 85 and over is written as 85-99 for maintaining naming consistency
# The aggregation is identical for males, females and persons, so it is done
# once in a helper instead of 21 near-identical rowSums() lines.
aggregate_age_groups <- function(dt, prefix, total_col) {
  # dt: one sex's SA2 table (updated by reference and returned);
  # prefix: "m", "f" or "p"; total_col: that sex's ABS total column.
  bands <- list(
    `00_24` = c("0-4", "5-9", "10-14", "15-19", "20-24"),
    `25_44` = c("25-29", "30-34", "35-39", "40-44"),
    `45_64` = c("45-49", "50-54", "55-59", "60-64"),
    `65_74` = c("65-69", "70-74"),
    `75_84` = c("75-79", "80-84"),
    `85_99` = c("85 and over")
  )
  for (band in names(bands)) {
    dt[, (paste0(prefix, "_", band)) := rowSums(.SD), .SDcols = bands[[band]]]
  }
  # Carry the ABS total over as <prefix>_all
  dt[, (paste0(prefix, "_all")) := rowSums(.SD), .SDcols = total_col]
  # Remove the original age group columns and the total column
  dt[, c(unlist(bands), total_col) := NULL]
  dt
}
erp_sa2_2001_2023_asgs2021_m <- aggregate_age_groups(erp_sa2_2001_2023_asgs2021_m, "m", "Total males")
erp_sa2_2001_2023_asgs2021_f <- aggregate_age_groups(erp_sa2_2001_2023_asgs2021_f, "f", "Total females")
erp_sa2_2001_2023_asgs2021_p <- aggregate_age_groups(erp_sa2_2001_2023_asgs2021_p, "p", "Total persons")
rm(aggregate_age_groups)
# Combine all sexes
erp_sa2_2001_2023_asgs2021 <- merge(erp_sa2_2001_2023_asgs2021_m, erp_sa2_2001_2023_asgs2021_f, by = c("Year", "SA2_NAME"))
erp_sa2_2001_2023_asgs2021 <- merge(erp_sa2_2001_2023_asgs2021, erp_sa2_2001_2023_asgs2021_p, by = c("Year", "SA2_NAME"))
rm(erp_sa2_2001_2023_asgs2021_m, erp_sa2_2001_2023_asgs2021_f, erp_sa2_2001_2023_asgs2021_p)
We will calculate the compound annual growth rate (CAGR) for backcasting (projecting backwards in time — the opposite of forecasting), based on how the population changed from its starting point to its end point and on how many years that change took.
https://www.investopedia.com/terms/g/growthrates.asp provides good explanation of this.
The formula for compound annual growth rate is: [(end_value / start_value) ^ (1/years)] - 1
# Reshape the data to long format: one row per Year x SA2 x Group, where
# Group is one of the 21 sex-by-age-band columns (m_/f_/p_ x 00_24 ... all)
erp_long <- melt(erp_sa2_2001_2023_asgs2021,
id.vars = c("Year", "SA2_NAME"),
variable.name = "Group",
value.name = "Population")
# Years to backcast (the SA2 series itself starts at 2001)
years_to_predict <- 1995:2000
## Growth rate backcasting ##
# Backcast population for years before the observed series.
#
# Computes the compound annual growth rate (CAGR) over the first `n_years`
# of the series, i.e. (end / start)^(1 / n_years) - 1, and projects it
# backwards from the earliest observed year.
#
# data:             data.frame for one SA2 x Group with columns `Year` and
#                   `Population` (one row per year).
# years_to_predict: integer vector of years to backcast (all < min(Year)).
# Returns a data.frame with columns `Year` and `Population` (rounded, >= 0).
extrapolate_backcasting <- function(data, years_to_predict) {
  # Years since start to consider for the growth rate.
  # 1 year performed best against state level ERP (done in later steps).
  n_years <- 1
  base_year <- min(data$Year)
  start_pop <- data$Population[data$Year == base_year]
  end_pop <- data$Population[data$Year == base_year + n_years]
  # Handle cases where start_pop is 0 (to avoid division by zero)
  if (start_pop == 0) {
    return(data.frame(Year = years_to_predict, Population = 0))
  }
  # Calculate the annualized growth rate
  growth_rate <- ((end_pop / start_pop)^(1 / n_years)) - 1
  # Negative growth rates can overestimate populations of past years, so the
  # magnitude is reduced to a third; the equation is also reversed so past
  # predictions are higher than the starting value.
  rate <- if (growth_rate < 0) abs(growth_rate / 3) else growth_rate
  # Vectorised over all requested years (replaces the per-year sapply):
  #   negative growth: start * (1 + |g|/3)^(base - year)  (larger in the past)
  #   otherwise:       start / (1 + g)^(base - year)      (smaller in the past)
  exponent <- base_year - years_to_predict
  predictions <- if (growth_rate < 0) {
    start_pop * (1 + rate)^exponent
  } else {
    start_pop / (1 + rate)^exponent
  }
  # Ensure populations are not negative or NaN, then round to whole persons
  predictions[is.na(predictions) | predictions < 0] <- 0
  data.frame(Year = years_to_predict, Population = round(predictions))
}
# Apply growth rate backcasting to each SA2 x Group series
# NOTE(review): dplyr::do() is deprecated in newer dplyr versions; it still
# works here but could be replaced by group_modify() when dplyr is upgraded.
backcasting_data <- erp_long %>%
group_by(`SA2_NAME`, Group) %>%
do(extrapolate_backcasting(., years_to_predict)) %>%
ungroup()
# Combine with original data (backcast 1995-2000 rows + observed 2001-2023)
full_data_backcasting <- bind_rows(erp_long, backcasting_data)
# Reshape back to wide format: one column per age-sex Group
erp_backcasting <- dcast(full_data_backcasting, Year + `SA2_NAME` ~ Group, value.var = "Population")
# Sort the final data
setorder(erp_backcasting, Year, `SA2_NAME`)
# Clean up environment
rm(erp_long, years_to_predict, backcasting_data, full_data_backcasting, extrapolate_backcasting)
# Visual sanity check: person totals over time for a random sample of SA2s
dt <- copy(erp_backcasting)
# Subset data for 20 random SA2_NAMEs (seeded so the sample is reproducible)
set.seed(123)
sa2_names <- sample(unique(dt$`SA2_NAME`), 20)
dt <- dt[`SA2_NAME` %in% sa2_names]
plot_ly(
data = dt,
x = ~Year,
# Alternative y-axes for checking males/females instead of persons:
# y = ~m_all,
# y = ~f_all,
y = ~p_all,
color = ~`SA2_NAME`,
type = 'scatter',
mode = 'lines',
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>SA2:", `SA2_NAME`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Australia SA2-level population estimates (persons) for 1995-2023",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "SA2_NAME"))
) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Australia SA2-level population estimates (persons) for 1995-2023'))
# Aggregate by year column and sum all columns disregarding SA2_NAME,
# giving national totals per age-sex category
dt_agg <- copy(erp_backcasting)
dt_agg <- dt_agg[, `SA2_NAME` := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = c("Year"), variable.name = "Category", value.name = "Count")
plot_ly(
  data = dt_agg,
  x = ~Year,
  y = ~Count,
  color = ~Category,
  type = 'scatter',
  mode = 'lines', # Line plot
  hoverinfo = 'text',
  text = ~paste("Year:", Year
                , "<br>Count:", Count)
) %>%
  layout(
    # Fixed labels: this plot shows the national aggregate by age-sex
    # category, not per-SA2 series, so the old "by SA2"/"SA2_NAME" labels
    # were misleading.
    title = "Australia population estimates by age and sex for 1995-2023",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Count"),
    legend = list(title = list(text = "Category"))
  ) %>%
  config(toImageButtonOptions = list(
    format = 'svg', scale = 1, height = 1000, width = 1000,
    filename = 'Australia population estimates by age and sex for 1995-2023'))
# Remove temporary objects.
# Fixed: the old rm() also listed `p`, which is never created in this script
# and produced a "object 'p' not found" warning.
rm(dt, dt_agg, sa2_names)
| Data Property | Specifications |
|---|---|
| Age | 1-year |
| Sex | Male, female, persons |
| Spatial scope | Australia |
| Spatial resolution | State |
| Temporal scope | 1971 - 2024 |
| Temporal resolution | Yearly |
| Reference | Australian Bureau of Statistics. (2024, June). National, state and territory population. ABS. https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/jun-2024. |
We can use these state-level ABS estimates to adjust our SA2-level extrapolations
# Load state-level single-year-of-age ERP (Tables 51-58).
# Every state workbook has the same layout: two sheets (Data1/Data2) that are
# merged on their first (date) column, a STATE tag, and 9 trailing metadata
# rows to drop. One helper replaces eight copies of the same code.
load_state_erp <- function(table_no, state_label, state_code) {
  # table_no: ABS table number ("51"-"58"); state_label: the state name used
  # in the file name; state_code: short code stored in the STATE column.
  path <- paste0("ABS data/TABLE ", table_no,
                 ". Estimated Resident Population By Single Year Of Age, ",
                 state_label, ".xlsx")
  part_1 <- as.data.table(read_excel(path, sheet = "Data1"))
  part_2 <- as.data.table(read_excel(path, sheet = "Data2"))
  # Merge the two sheets by matching on the first column
  dt <- merge(part_1, part_2, by = names(part_1)[1], all = TRUE)
  # Add a column of STATE
  dt[, STATE := state_code]
  # Delete the last 9 rows (trailing metadata in the workbook)
  dt[1:(nrow(dt) - 9)]
}
states <- list(
  c("51", "New South Wales", "nsw"),
  c("52", "Victoria", "vic"),
  c("53", "Queensland", "qld"),
  c("54", "South Australia", "sa"),
  c("55", "Western Australia", "wa"),
  c("56", "Tasmania", "tas"),
  c("57", "Northern Territory", "nt"),
  c("58", "Australian Capital Territory", "act")
)
# Bind all states into one table (columns matched by name, as rbind did)
erp_state_1971_2024 <- rbindlist(
  lapply(states, function(s) load_state_erp(s[1], s[2], s[3])),
  use.names = TRUE)
# Remove temporary objects
rm(load_state_erp, states)
# Change name of the first column to Year
setnames(erp_state_1971_2024,
names(erp_state_1971_2024)[1],
"Year")
# The first column holds Excel serial dates; convert to the calendar year.
# Stored as an integer (not a character string) so the range filter below and
# later joins on Year compare numbers rather than relying on lexicographic
# string comparison.
erp_state_1971_2024[, Year := as.integer(format(as.Date(as.numeric(Year),
                                                        origin = "1899-12-30"), "%Y"))]
# Keep 1995-2023 only (the span covered by the SA2-level series).
# The previous copy() was discarded by the very next assignment, so it and
# the two-step filter are collapsed into one subset.
erp_state_1995_2023 <- erp_state_1971_2024[Year >= 1995 & Year <= 2023]
# Function to clean and rename columns.
# Converts raw ABS headers like "Estimated Resident Population ;  Male ;  5 ;"
# into short names like "m_5" ("100 and over" becomes "100"). Headers that do
# not match the expected pattern are returned unchanged.
#
# names_vector: character vector of column names.
# Returns a character vector of the same length with cleaned names.
convert_colnames <- function(names_vector) {
  # vapply (instead of sapply) guarantees a character vector even for
  # zero-length input; base string functions replace the stringr calls.
  vapply(names_vector, function(name) {
    # Remove any leading/trailing spaces
    name <- trimws(name)
    # Ensure the column follows the expected pattern
    if (!grepl("^Estimated Resident Population ;", name)) return(name)
    # Split by ";" while absorbing surrounding spaces
    parts <- strsplit(name, "\\s*;\\s*")[[1]]
    # Skip unexpected formats (need label, sex and age parts)
    if (length(parts) < 3) return(name)
    gender <- parts[2] # Extract gender
    age <- parts[3]    # Extract age
    # Convert gender to lowercase initial (m/f/p)
    gender_initial <- tolower(substr(gender, 1, 1))
    # Process age values: spaces -> underscores, "100_and_over" -> "100"
    age <- gsub(" ", "_", age, fixed = TRUE)
    age <- sub("100_and_over", "100", age, fixed = TRUE)
    # Return final column name
    paste0(gender_initial, "_", age)
  }, character(1), USE.NAMES = FALSE)
}
# Apply function to rename columns (e.g. "Estimated Resident Population ;
# Male ;  5 ;" -> "m_5"); names that do not match the pattern pass through.
setnames(erp_state_1995_2023,
names(erp_state_1995_2023),
convert_colnames(names(erp_state_1995_2023)))
# Convert all columns to numeric except STATE column, so Year and every
# count column can be summed and compared numerically.
cols_to_convert <- setdiff(names(erp_state_1995_2023), "STATE")
erp_state_1995_2023[, (cols_to_convert) := lapply(.SD, as.numeric), .SDcols = cols_to_convert]
# Form age group categories similar to the SA2 data.
# Single-year columns (m_0 ... m_100 etc.) are summed into the CVD research
# bands. "85_99" covers single years 85-100 (100 = "100 and over") to mirror
# the SA2 "85 and over" band, and "all" covers 0-100.
# Done as a loop over sexes and bands instead of 21 repeated rowSums lines.
age_bands <- list(`00_24` = 0:24, `25_44` = 25:44, `45_64` = 45:64,
                  `65_74` = 65:74, `75_84` = 75:84, `85_99` = 85:100,
                  all = 0:100)
for (sex in c("m", "f", "p")) {
  for (band in names(age_bands)) {
    erp_state_1995_2023[, (paste0(sex, "_", band)) := rowSums(.SD),
                        .SDcols = paste0(sex, "_", age_bands[[band]])]
  }
}
# Remove individual age groups
erp_state_1995_2023[, c(paste0("m_", 0:100), paste0("f_", 0:100), paste0("p_", 0:100)) := NULL]
# Remove temporary objects
rm(convert_colnames, cols_to_convert, age_bands, sex, band)
# Plot the differences in these two datasets for all years, age groups and sex
# (SA2-derived national aggregates as thin translucent lines, ABS state-level
# aggregates as thick opaque lines)
erp_backcasting_agg <- copy(erp_backcasting)
erp_backcasting_agg <- erp_backcasting_agg[, `SA2_NAME` := NULL]
erp_backcasting_agg <- erp_backcasting_agg[, lapply(.SD, sum), by = Year]
erp_backcasting_agg <- melt(erp_backcasting_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Colour by sex: "#f00" (red) when Category contains "f" (female),
# "#00f" (blue) when it contains "m" (male), otherwise "#0f0" (green) for
# persons. (The previous comment had the male/person colours swapped.)
erp_backcasting_agg[, color := ifelse(grepl("f", Category), "#f00", ifelse(grepl("m", Category), "#00f", "#0f0"))]
# erp_backcasting_agg <- erp_backcasting_agg[color == "#0f0"]
erp_state_1995_2023_agg <- copy(erp_state_1995_2023)
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, STATE := NULL]
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, lapply(.SD, sum), by = Year]
erp_state_1995_2023_agg <- melt(erp_state_1995_2023_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Same colour scheme: red = female, blue = male, green = persons
erp_state_1995_2023_agg[, color := ifelse(grepl("f", Category), "#f00", ifelse(grepl("m", Category), "#00f", "#0f0"))]
# erp_state_1995_2023_agg <- erp_state_1995_2023_agg[color == "#0f0"]
plot_ly(data = erp_backcasting_agg, x = ~Year, y = ~Value, color = ~Category,
line = list(color = adjust_transparency(erp_backcasting_agg$color, 0.5), width = 1),
type = "scatter", mode = "lines") %>%
add_trace(data = erp_state_1995_2023_agg, x = ~Year, y = ~Value, color = ~Category, line = list(color = adjust_transparency(erp_state_1995_2023_agg$color, 1), width = 2),
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Checking all the ratios and making a heatmap
erp_backcasting_agg <- copy(erp_backcasting)
erp_backcasting_agg <- erp_backcasting_agg[, `SA2_NAME` := NULL]
erp_backcasting_agg <- erp_backcasting_agg[, lapply(.SD, sum), by = Year]
erp_state_1995_2023_agg <- copy(erp_state_1995_2023)
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, STATE := NULL]
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, lapply(.SD, sum), by = Year]
# Element-wise ratios of the two tables. Year divides itself (giving 1) and
# is restored on the next line.
# NOTE(review): this assumes both tables have identical row order (sorted by
# Year) and identical column order — verify if either pipeline changes.
ratio_table <- erp_state_1995_2023_agg[, lapply(.SD, as.numeric)] / erp_backcasting_agg[, lapply(.SD, as.numeric)]
ratio_table[, Year := erp_backcasting_agg$Year] # Restore Year column
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = "Year", variable.name = "Category", value.name = "Ratio")
# ratio_long <- ratio_long[Year <= 2000] # How well extrapolated estimates align with ABS state level estimates?
# Plot heatmap
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well extrapolated estimates align with ABS state level estimates?",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well extrapolated estimates align with ABS state level estimates'))
# Scatter of the same ratios, jittered, zoomed to the 0.8-1.1 band
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Australia state-level:SA2-level estimates for 1995-2023", y = "Ratio", x = "Year")) %>%
layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Australia state-level:SA2-level estimates for 1995-2023'))
# Rebuild the long table, restricted to years where both sources are ABS
# estimates (2001 onward), to see how well they agree with each other
ratio_long <- melt(ratio_table, id.vars = "Year", variable.name = "Category", value.name = "Ratio")
ratio_long <- ratio_long[Year >= 2001] # How well ABS estimates align with each other?
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well ABS estimates align with each other?", x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well ABS estimates align with each other'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Australia state-level:SA2-level estimates for 2001-2023 (zoomed)",
y = "Ratio", x = "Year")) %>%
# layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Australia state-level:SA2-level estimates for 2001-2023 (zoomed)'))
# Remove temporary objects
rm(erp_backcasting_agg, erp_state_1995_2023_agg, ratio_table, ratio_long)
# Add SA2_CODE to erp_backcasting by mapping SA2_NAME through the original
# ABS workbook (Table 1 pairs each SA2 name with its code), then derive the
# state from the code's leading digit.
sa2_lookup <- as.data.table(read_excel("ABS data/erp_sa2_2001_2023_asgs2021.xlsx",
                                       sheet = "Table 1", skip = 6))
setnames(sa2_lookup,
         old = c("SA2 code", "SA2 name"),
         new = c("SA2_CODE", "SA2_NAME"))
# One row per SA2 name/code pair
sa2_lookup <- unique(sa2_lookup[, .(SA2_CODE, SA2_NAME)])
erp_backcasting <- merge(erp_backcasting, sa2_lookup, by = "SA2_NAME", all.x = TRUE)
rm(sa2_lookup)
# Form STATE column using the first digit of SA2_CODE
erp_backcasting[, `SA2_CODE_1` := as.numeric(substr(SA2_CODE, 1, 1))]
# Removing all Other Territories (leading digit 9) as their data is not
# available from state level
erp_backcasting <- erp_backcasting[SA2_CODE_1 != 9]
erp_backcasting[, STATE := fcase(SA2_CODE_1 == 1, "nsw",
                                 SA2_CODE_1 == 2, "vic",
                                 SA2_CODE_1 == 3, "qld",
                                 SA2_CODE_1 == 4, "sa",
                                 SA2_CODE_1 == 5, "wa",
                                 SA2_CODE_1 == 6, "tas",
                                 SA2_CODE_1 == 7, "nt",
                                 SA2_CODE_1 == 8, "act")]
# The code columns are no longer needed once STATE exists
erp_backcasting[, c("SA2_CODE", "SA2_CODE_1") := NULL]
# Form columns that contain, for each Year x STATE, the sums of the age-sex
# columns across all SA2s (ignoring SA2_NAME), so SA2 values can be scaled
# to the ABS state-level estimates.
# Summarise by Year and STATE
# Select only the required numeric columns
numeric_cols <- grep("^m_|^f_|^p_", names(erp_backcasting), value = TRUE)
# Perform aggregation
erp_summarised <- erp_backcasting[, lapply(.SD, sum, na.rm = TRUE), by =
.(Year, STATE), .SDcols = numeric_cols]
# Rename columns by appending "_sum" (columns 1-2 are Year and STATE)
setnames(erp_summarised, old = names(erp_summarised)[-c(1,2)],
new = paste0(names(erp_summarised)[-c(1,2)], "_sum"))
# Merge erp_summarised with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_summarised, by = c("Year", "STATE"), all.x = TRUE)
# Rename columns in the state table by appending "_abs" (ABS estimates).
# NOTE(review): [-c(1,2)] assumes Year and STATE are columns 1 and 2 of
# erp_state_1995_2023; this currently holds because all single-year age
# columns were removed earlier, leaving STATE in position 2 — re-verify if
# the state table's construction changes.
setnames(erp_state_1995_2023, old = names(erp_state_1995_2023)[-c(1,2)],
new = paste0(names(erp_state_1995_2023)[-c(1,2)], "_abs"))
# Merge erp_state_1995_2023 with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_state_1995_2023, by = c("Year", "STATE"), all.x = TRUE)
# Identify the sum and abs columns
sum_cols <- grep("_sum$", names(erp_backcasting), value = TRUE)
abs_cols <- sub("_sum$", "_abs", sum_cols) # Get corresponding _abs columns
# Ensure all abs_cols exist before dividing
valid_cols <- abs_cols %in% names(erp_backcasting)
# Create ratio columns: ratio = ABS state estimate / sum of SA2 estimates
for (i in which(valid_cols)) {
ratio_col <- sub("_sum$", "_ratio", sum_cols[i]) # Define new column name
erp_backcasting[, (ratio_col) := get(abs_cols[i]) / get(sum_cols[i])]
}
# Make all _ratio column values 1 for Year column values 1995:2000 (there are
# no ABS state estimates to compare against for the backcast years, so the
# backcast values are left unscaled).
# Identify all _ratio columns
ratio_cols <- grep("_ratio$", names(erp_backcasting), value = TRUE)
# BUG FIX: the previous form `erp_backcasting[Year <= 2000][, (ratio_cols) := 1]`
# chained `:=` after a subset, which assigns into a temporary copy that is
# immediately discarded — the real table's 1995-2000 ratios stayed NA.
# data.table updates by reference need the filter and `:=` in the same `[ ]`.
erp_backcasting[Year <= 2000, (ratio_cols) := 1]
# Visualise the ratios
# Make a new dt with only the c("Year", "STATE", ratio_cols) columns of
# erp_backcasting; ratios are identical for every SA2 within a Year x STATE,
# so unique() collapses to one row per Year x STATE.
ratio_table <- erp_backcasting[, c("Year", "STATE", ratio_cols), with = FALSE]
ratio_table <- unique(ratio_table)
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = c("Year", "STATE"), variable.name = "Category", value.name = "Ratio")
# Inspect one state at a time (uncomment the state of interest):
# ratio_long <- ratio_long[STATE == "nsw"]
# ratio_long <- ratio_long[STATE == "vic"]
ratio_long <- ratio_long[STATE == "qld"]
# ratio_long <- ratio_long[STATE == "sa"]
# ratio_long <- ratio_long[STATE == "wa"]
# ratio_long <- ratio_long[STATE == "tas"]
# ratio_long <- ratio_long[STATE == "nt"]
# ratio_long <- ratio_long[STATE == "act"]
# Plot heatmap
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "Heatmap of Ratios after adjustment (non-round numbers)",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Heatmap of Ratios after adjustment (non-round numbers)'))
# Multiply numeric_cols with their respective _ratio columns to form _adjusted columns
# NOTE(review): this pairs .SD (numeric_cols) element-wise with
# mget(ratio_cols) by position — both derive from greps over the same
# original column order, so the pairing currently lines up; verify if either
# list is ever reordered.
adjusted_cols <- sub("_ratio$", "_adjusted", ratio_cols)
erp_backcasting[, (adjusted_cols) := Map(`*`, .SD, mget(ratio_cols)), .SDcols = numeric_cols]
# Not rounding the numbers here as they will be done in the next step
# Remove temporary objects
rm(abs_cols, adjusted_cols, i, numeric_cols, ratio_long, ratio_table, ratio_col, ratio_cols, sum_cols, valid_cols)
# Consistency corrections:
#  - all age groups collectively should match m_all, f_all, or p_all
#  - males and females of each age group should match p_all
#
# correct_population_sums(data, columns_to_fix, column_to_refer):
#   Proportionally rescales `columns_to_fix` row-wise (then rounds) so they
#   sum to `column_to_refer`, reports/plots rows still deviating by >1%
#   after rounding, and finally overwrites `column_to_refer` with the new
#   sum so the hierarchy stays internally consistent.
#   NOTE: `data` is a data.table, so it is modified by reference AND
#   returned; printing and plotting are deliberate side effects.
correct_population_sums <- function(data, columns_to_fix, column_to_refer) {
# Calculate row-wise sum of selected columns
data[, sum := rowSums(.SD, na.rm = TRUE), .SDcols = columns_to_fix]
# Compute ratio, ensuring no division by zero
data[, ratio := fifelse(get(column_to_refer) == 0, 1, sum / get(column_to_refer))]
# Handle cases where total population is zero: zero the parts too
data[get(column_to_refer) == 0, (columns_to_fix) := 0]
data[get(column_to_refer) == 0, ratio := 1]
# Divide each selected column by ratio and round to the nearest integer
data[, (columns_to_fix) := lapply(.SD, function(x) round(x / ratio)), .SDcols = columns_to_fix]
# Calculate new sum and ratio after adjustments (rounding can reintroduce
# small discrepancies)
data[, sum_new := rowSums(.SD, na.rm = TRUE), .SDcols = columns_to_fix]
data[, ratio_new := fifelse(get(column_to_refer) == 0, 1, sum_new / get(column_to_refer))]
# Replace Inf and NaN values before plotting
data[is.infinite(ratio_new) | is.na(ratio_new), ratio_new := 1]
# Only plot if there are valid values to plot
deviations <- data[ratio_new < 0.99 | ratio_new > 1.01]$ratio_new
if (length(deviations) > 0) {
plot(deviations, main = "Deviations in Ratio", ylab = "Ratio New", xlab = "Index")
} else {
print("No significant deviations found.")
}
# Count number of affected rows
bad_rows <- nrow(data[ratio_new < 0.99 | ratio_new > 1.01])
# Print deviation statistics
print(paste0("Number of >1% deviations used to update referral column: ", bad_rows))
print(paste0("This is ", format((bad_rows / nrow(data)) * 100, digits = 3), "% of ", nrow(data), " rows"))
if (bad_rows > 0) {
print(paste0("With a mean of ", format(mean(deviations, na.rm = TRUE), digits = 3)))
}
# Overwrite the reference ('all') column with the new sum
data[, (column_to_refer) := sum_new]
# Replace any remaining NaN values with 0
# NOTE(review): matrix-index assignment on a data.table — confirm this form
# is supported by the installed data.table version.
data[is.na(data)] <- 0
# Remove temporary columns
data[, c("sum", "ratio", "sum_new", "ratio_new") := NULL]
return(data)
}
# Step 1: Rescale the person age-group columns so they sum to p_all
erp_backcasting <- correct_population_sums(
  erp_backcasting,
  sprintf("p_%s_adjusted", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99")),
  "p_all_adjusted"
)
[1] "Number of >1% deviations used to update referral column: 253"
[1] "This is 0.356% of 71050 rows"
[1] "With a mean of 0.996"
# Step 2: Rescale the sex totals so m_all + f_all equals p_all
erp_backcasting <- correct_population_sums(
  erp_backcasting,
  c("m_all_adjusted", "f_all_adjusted"),
  "p_all_adjusted"
)
[1] "Number of >1% deviations used to update referral column: 1"
[1] "This is 0.00141% of 71050 rows"
[1] "With a mean of 0"
# Step 3: Individual age groups of each sex should be made equal to sum of all age groups of the respective sex
erp_backcasting <- correct_population_sums(
  data = erp_backcasting,
  columns_to_fix = paste0("m_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"), "_adjusted"),
  column_to_refer = "m_all_adjusted"
)
# FIX(review): the two printed summaries below show this step ran once for
# males and once for females, but the female call was missing from the source;
# without it the f_ age groups are never reconciled with f_all before Step 4.
erp_backcasting <- correct_population_sums(
  data = erp_backcasting,
  columns_to_fix = paste0("f_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"), "_adjusted"),
  column_to_refer = "f_all_adjusted"
)
[1] "Number of >1% deviations used to update referral column: 147"
[1] "This is 0.207% of 71050 rows"
[1] "With a mean of 0.983"
[1] "Number of >1% deviations used to update referral column: 154"
[1] "This is 0.217% of 71050 rows"
[1] "With a mean of 0.997"
# Step 4: Make sure male and female in each age group = person of that age group
# Now that all major revisions have been made, persons are recomputed as the
# sum of the male and female columns of each age band (NA still propagates,
# exactly as rowSums without na.rm would).
erp_backcasting[, p_00_24_adjusted := m_00_24_adjusted + f_00_24_adjusted]
erp_backcasting[, p_25_44_adjusted := m_25_44_adjusted + f_25_44_adjusted]
erp_backcasting[, p_45_64_adjusted := m_45_64_adjusted + f_45_64_adjusted]
erp_backcasting[, p_65_74_adjusted := m_65_74_adjusted + f_65_74_adjusted]
erp_backcasting[, p_75_84_adjusted := m_75_84_adjusted + f_75_84_adjusted]
erp_backcasting[, p_85_99_adjusted := m_85_99_adjusted + f_85_99_adjusted]
erp_backcasting[, p_all_adjusted := m_all_adjusted + f_all_adjusted]
# Keep only Year, STATE, SA2_NAME and the *_adjust* columns; drop the rest
cols_to_remove <- names(erp_backcasting)[
  !(names(erp_backcasting) %in% c("Year", "STATE", "SA2_NAME")) &
    !grepl("_adjust", names(erp_backcasting))
]
erp_backcasting[, (cols_to_remove) := NULL]
# Strip the "_adjusted" suffix from the remaining column names
# (sub leaves non-matching names untouched, so only *_adjusted columns change)
setnames(erp_backcasting, sub("_adjusted$", "", names(erp_backcasting)))
rm(correct_population_sums, cols_to_remove)
# Backup before re-tethering the corrected series to the ABS state benchmarks
b_erp_backcasting <- copy(erp_backcasting)
# Recalculate like we did when we tethered the historical population the first time
# Form columns that contains sums of age group column values for each year and STATE column without considering SA2_NAME column
# Summarise by Year and STATE
# Select only the required numeric columns
numeric_cols <- grep("^m_|^f_|^p_", names(erp_backcasting), value = TRUE)
# Perform aggregation
erp_summarised <- erp_backcasting[, lapply(.SD, sum, na.rm = TRUE), by =
.(Year, STATE), .SDcols = numeric_cols]
# Rename columns by appending "_sum"
setnames(erp_summarised, old = names(erp_summarised)[-c(1,2)],
new = paste0(names(erp_summarised)[-c(1,2)], "_sum"))
# Merge erp_summarised with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_summarised, by = c("Year", "STATE"), all.x = TRUE)
# Merge erp_state_1995_2023 with erp_backcasting based on Year and STATE columns
# NOTE(review): erp_state_1995_2023 is built earlier in the workflow and is
# expected to carry the *_abs state-level benchmark columns — confirm
erp_backcasting <- merge(erp_backcasting, erp_state_1995_2023, by = c("Year", "STATE"), all.x = TRUE)
# Identify the sum and abs columns
sum_cols <- grep("_sum$", names(erp_backcasting), value = TRUE)
abs_cols <- sub("_sum$", "_abs", sum_cols) # Get corresponding _abs columns
# Ensure all abs_cols exist before dividing
valid_cols <- abs_cols %in% names(erp_backcasting)
# Create ratio columns (state benchmark / aggregated SA2 extrapolation)
for (i in which(valid_cols)) {
ratio_col <- sub("_sum$", "_ratio", sum_cols[i]) # Define new column name
erp_backcasting[, (ratio_col) := get(abs_cols[i]) / get(sum_cols[i])]
}
# Make all _ratio column values 1 for Year column values 1995:2000
# Identify all _ratio columns
ratio_cols <- grep("_ratio$", names(erp_backcasting), value = TRUE)
# Set _ratio values to 1 for Year between 1995 and 2000
# BUG FIX: the original `erp_backcasting[Year <= 2000][, (ratio_cols) := 1]`
# assigned into the temporary copy returned by the subset, which was then
# discarded — the original table was never updated. The filter must live
# inside a single [] call for := to update by reference.
erp_backcasting[Year <= 2000, (ratio_cols) := 1]
# Visualise the ratios
# make a new dt with only c("Year", "STATE", ratio_cols) columns of erp_backcasting
ratio_table <- erp_backcasting[, c("Year", "STATE", ratio_cols), with = FALSE]
ratio_table <- unique(ratio_table)
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = c("Year", "STATE"), variable.name = "Category", value.name = "Ratio")
# One state is inspected at a time; only the Queensland filter is active here
# ratio_long <- ratio_long[STATE == "nsw"]
# ratio_long <- ratio_long[STATE == "vic"]
ratio_long <- ratio_long[STATE == "qld"]
# ratio_long <- ratio_long[STATE == "sa"]
# ratio_long <- ratio_long[STATE == "wa"]
# ratio_long <- ratio_long[STATE == "tas"]
# ratio_long <- ratio_long[STATE == "nt"]
# ratio_long <- ratio_long[STATE == "act"]
# Plot heatmap (midpoint 1 = extrapolation matches the ABS state benchmark)
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well extrapolated estimates align with ABS state level estimates after adjustments?",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well extrapolated estimates align with ABS state level estimates after adjustments'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Queensland state-level:SA2-level estimates for 1995-2023",
y = "Ratio", x = "Year")) %>%
layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Queensland state-level:SA2-level estimates for 1995-2023'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Queensland state-level:SA2-level estimates for 1995-2023 (zoomed)",
y = "Ratio", x = "Year")) %>%
# The fixed y-axis zoom is deliberately disabled for this second plot
# layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Queensland state-level:SA2-level estimates for 1995-2023 (zoomed)'))
erp_sa2_1995_2023_asgs2021 <- copy(erp_backcasting)
rm(erp_backcasting)
write.csv(erp_sa2_1995_2023_asgs2021, "Population estimates data/For cardiovascular diseases/erp_sa2_1995_2023_asgs2021.csv", row.names = FALSE)
# Remove all non-qld data as SA1 level estimates are not available for them
erp_sa2_1995_2023_asgs2021_qld <- erp_sa2_1995_2023_asgs2021[STATE == "qld"]
# Remove STATE column
erp_sa2_1995_2023_asgs2021_qld[, STATE := NULL]
# Convert SA2 counts into growth ratios relative to the 2011 baseline year
erp_sa2_proportions <- copy(erp_sa2_1995_2023_asgs2021_qld)
# Identify the 2011 baseline
# NOTE(review): positions 3:23 are assumed to be the 21 population columns
# (m_/f_/p_ bands) — confirm if the column layout upstream ever changes
baseline <- erp_sa2_proportions[Year == 2011, .SD, .SDcols = c("SA2_NAME", names(erp_sa2_proportions)[3:23])]
# Replace 0 with 0.5 to prevent division by zero
baseline[, (names(baseline)[-1]) := lapply(.SD, function(x) fifelse(x == 0, 0.5, x)), .SDcols = -1]
# Rename columns for merging
setnames(baseline, old = names(baseline)[-1], new = paste0("base_", names(baseline)[-1]))
# Merge baseline with the full dataset
erp_sa2_proportions <- merge(erp_sa2_proportions, baseline, by = "SA2_NAME", all.x = TRUE)
# Compute proportions (divide each column by its corresponding 2011 baseline)
for (col in names(erp_sa2_proportions)[3:23]) {
base_col <- paste0("base_", col)
erp_sa2_proportions[, (col) := get(col) / get(base_col)]
}
# Drop baseline columns
erp_sa2_proportions[, (grep("^base_", names(erp_sa2_proportions), value = TRUE)) := NULL]
# Handling areas that had 0 population
# Identify ratio columns
ratio_cols <- setdiff(names(erp_sa2_proportions), c("SA2_NAME", "Year"))
# Find SA2s where at least one ratio column is 0 in 2011
zero_cols_2011 <- erp_sa2_proportions[Year == 2011, lapply(.SD, function(x) x == 0), .SDcols = ratio_cols]
# sa2_to_modify is computed for inspection only; the loop below recomputes
# the affected SA2s per column
sa2_to_modify <- erp_sa2_proportions[Year == 2011][, `SA2_NAME`][rowSums(zero_cols_2011) > 0]
# For each column, only divide by 2 if it was 0 in 2011
for (col in ratio_cols) {
sa2_with_zero <- erp_sa2_proportions[Year == 2011 & get(col) == 0, unique(`SA2_NAME`)]
# Only divide this column for the affected SA2s
# (undoes the 0 -> 0.5 baseline substitution: ratio = pop / 0.5 = 2 * pop,
# so halving recovers the raw population count for these cells)
erp_sa2_proportions[`SA2_NAME` %in% sa2_with_zero, (col) := get(col) / 2]
# Replace 0 with 1 in 2011 for affected SA2s
# erp_sa2_proportions[Year == 2011 & `SA2_NAME` %in% sa2_with_zero, (col) := fifelse(get(col) == 0, 1, get(col))]
## Replacing 0 with 1 can ruin aggregate results and disrupt population distribution
## Areas with starting ratio as 0 does not matter as all contained SA1s will also have 0 population in 2011
## Not doing this is the right thing
## This means that the ratio of all years will be equal to population counts for areas with 0 pop in 2011
}
# Remove temporary objects
rm(baseline, base_col, col, ratio_cols, zero_cols_2011, sa2_to_modify, sa2_with_zero)
# Saving a backup copy before processing in next steps
b_erp_sa2_proportions <- copy(erp_sa2_proportions)
| Data Property | Specifications |
|---|---|
| Age | 1-year |
| Sex | Male, female |
| Spatial scope | Queensland |
| Spatial resolution | SA1 |
| Temporal scope | 2011 - 2023 |
| Temporal resolution | Yearly |
| Reference | Australian Bureau of Statistics. (2024). Estimated Resident Population, Customised Report. ABS. https://www.qgso.qld.gov.au/statistics/theme/population/population-estimates/regions. |
# Load SA1 ERPs
# The original table was broken into two as the file size would otherwise be greater than 100mb, which is the limit of GitHub
erp_sa1_2001_2023_asgs2021_table_1_first_100000_rows <- as.data.table(read_excel("ABS data/erp_sa1_2001_2023_asgs2021_table_1_first_100000_rows.xlsx",
sheet = "Table 1", skip = 4))
erp_sa1_2001_2023_asgs2021_table_1_after_100000_rows <- as.data.table(read_excel("ABS data/erp_sa1_2001_2023_asgs2021_table_1_after_100000_rows.xlsx",
sheet = "Table 1", skip = 4))
erp_sa1_2011_2023_asgs2021 <- rbind(erp_sa1_2001_2023_asgs2021_table_1_first_100000_rows,
erp_sa1_2001_2023_asgs2021_table_1_after_100000_rows)
rm(erp_sa1_2001_2023_asgs2021_table_1_first_100000_rows,
erp_sa1_2001_2023_asgs2021_table_1_after_100000_rows)
# Drop footer/blank rows that have no Year value
erp_sa1_2011_2023_asgs2021 <- erp_sa1_2011_2023_asgs2021[!is.na(Year)]
# Rename SA1 code to SA1_CODE for consistency with other boundary datasets
setnames(erp_sa1_2011_2023_asgs2021, old = "SA1 code", new = "SA1_CODE")
setnames(erp_sa1_2011_2023_asgs2021, old = "SA2 code", new = "SA2_CODE")
setnames(erp_sa1_2011_2023_asgs2021, old = "SA2 name", new = "SA2_NAME")
# Check if every SA2_NAME from erp_sa1_2011_2023_asgs2021 is also present in erp_sa2_2001_2023_asgs2021
# NOTE(review): erp_sa2_2001_2023_asgs2021 must still exist from an earlier
# chunk of this workflow — confirm; this check compares SA2_NAME values
x <- erp_sa1_2011_2023_asgs2021[!erp_sa1_2011_2023_asgs2021$`SA2_NAME` %in% erp_sa2_2001_2023_asgs2021$`SA2_NAME`]
x <- unique(x, by = "SA2_NAME")
# Every Queensland SA2_NAME matches perfectly, as it should
rm(x)
# Form age group categories similar to sa2 data
# FIX(review): the original used rowSums(.SD[, paste0("m", 0:24)]) etc. — an
# expression-valued character vector in j is not column selection in
# data.table, so those lines are fragile/broken; the supported .SDcols
# mechanism selects the single-year columns instead.
erp_sa1_2011_2023_asgs2021[, m_00_24 := rowSums(.SD), .SDcols = paste0("m", 0:24)]
erp_sa1_2011_2023_asgs2021[, m_25_44 := rowSums(.SD), .SDcols = paste0("m", 25:44)]
erp_sa1_2011_2023_asgs2021[, m_45_64 := rowSums(.SD), .SDcols = paste0("m", 45:64)]
erp_sa1_2011_2023_asgs2021[, m_65_74 := rowSums(.SD), .SDcols = paste0("m", 65:74)]
erp_sa1_2011_2023_asgs2021[, m_75_84 := rowSums(.SD), .SDcols = paste0("m", 75:84)]
erp_sa1_2011_2023_asgs2021[, m_85_99 := rowSums(.SD), .SDcols = "m85+"]
erp_sa1_2011_2023_asgs2021[, m_all := rowSums(.SD), .SDcols = c(paste0("m", 0:84), "m85+")]
erp_sa1_2011_2023_asgs2021[, f_00_24 := rowSums(.SD), .SDcols = paste0("f", 0:24)]
erp_sa1_2011_2023_asgs2021[, f_25_44 := rowSums(.SD), .SDcols = paste0("f", 25:44)]
erp_sa1_2011_2023_asgs2021[, f_45_64 := rowSums(.SD), .SDcols = paste0("f", 45:64)]
erp_sa1_2011_2023_asgs2021[, f_65_74 := rowSums(.SD), .SDcols = paste0("f", 65:74)]
erp_sa1_2011_2023_asgs2021[, f_75_84 := rowSums(.SD), .SDcols = paste0("f", 75:84)]
erp_sa1_2011_2023_asgs2021[, f_85_99 := rowSums(.SD), .SDcols = "f85+"]
erp_sa1_2011_2023_asgs2021[, f_all := rowSums(.SD), .SDcols = c(paste0("f", 0:84), "f85+")]
erp_sa1_2011_2023_asgs2021[, p_00_24 := rowSums(.SD), .SDcols = c(paste0("m", 0:24), paste0("f", 0:24))]
erp_sa1_2011_2023_asgs2021[, p_25_44 := rowSums(.SD), .SDcols = c(paste0("m", 25:44), paste0("f", 25:44))]
erp_sa1_2011_2023_asgs2021[, p_45_64 := rowSums(.SD), .SDcols = c(paste0("m", 45:64), paste0("f", 45:64))]
erp_sa1_2011_2023_asgs2021[, p_65_74 := rowSums(.SD), .SDcols = c(paste0("m", 65:74), paste0("f", 65:74))]
erp_sa1_2011_2023_asgs2021[, p_75_84 := rowSums(.SD), .SDcols = c(paste0("m", 75:84), paste0("f", 75:84))]
erp_sa1_2011_2023_asgs2021[, p_85_99 := rowSums(.SD), .SDcols = c("m85+", "f85+")]
erp_sa1_2011_2023_asgs2021[, p_all := rowSums(.SD), .SDcols = c(paste0("m", 0:84), "m85+", paste0("f", 0:84), "f85+")]
# Remove individual age groups
erp_sa1_2011_2023_asgs2021[, c(paste0("m", 0:84), "m85+", paste0("f", 0:84), "f85+", "total") := NULL]
# Extend population counts in all age group columns of erp_sa1_2011_2023_asgs2021
# from years 1995 to 2010 based on 2011 year population counts using ratios available
# in erp_sa2_proportions on match with SA2_NAME and Year
erp_sa1 <- copy(erp_sa1_2011_2023_asgs2021)
# Step 1: Filter `erp_sa2_proportions` for years 1995-2010
erp_sa2_proportions <- erp_sa2_proportions[Year >= 1995 & Year <= 2010]
# Step 2: Filter `erp_sa1_2011_2023_asgs2021` for Year = 2011 (to use as baseline)
erp_sa1_2011 <- erp_sa1[Year == 2011]
# Step 3: Merge datasets on "SA2_NAME" (every SA1 row is repeated for each of
# the 16 backcast years, hence allow.cartesian)
erp_extended <- merge(
erp_sa2_proportions,
erp_sa1_2011,
by = "SA2_NAME",
suffixes = c("_ratio", "_count"),
allow.cartesian = TRUE
)
# Step 4: Multiply 2011 counts by backcasting ratios to estimate past population counts
# Select all colnames of erp_extended ending in _count
age_groups <- grep("_count$", names(erp_extended), value = TRUE)
# Remove "Year_count" from age_groups
age_groups <- setdiff(age_groups, "Year_count")
for (age in age_groups) {
ratio_col <- sub("_count$", "_ratio", age) # Convert _count to _ratio
erp_extended[, (age) := get(age) * get(ratio_col)]
}
# Not rounding the numbers here as it will not serve any purpose because we will not be using SA1 level data
# It will only have downside by inducing rounding errors
# Step 5: Keep necessary columns and rename for consistency
# NOTE(review): embedding .SD inside .() prefixes the age-group column names;
# the setnames below then renames purely by position, so the selection order
# here must exactly match names(erp_sa1) — confirm before reordering anything
erp_extended <- erp_extended[, .(Year_ratio, `SA1_CODE`, `SA2_CODE`, `SA2_NAME`, .SD), .SDcols = age_groups]
# Make all erp_extended column names same as erp_sa1
setnames(erp_extended, old = names(erp_extended), new = names(erp_sa1))
# Step 6: Append to the existing `erp_sa1_2011_2023_asgs2021`
erp_sa1_1995_2023_asgs2021 <- rbind(erp_sa1, erp_extended, fill = TRUE)
# Step 7: Order the data for consistency
setorder(erp_sa1_1995_2023_asgs2021, `SA1_CODE`, Year)
# Remove temporary objects
rm(erp_sa1, erp_sa1_2011, erp_extended, age_groups, age, ratio_col, erp_sa2_proportions, b_erp_sa2_proportions)
# Visual sanity checks on the extended SA1 series
dt <- copy(erp_sa1_1995_2023_asgs2021)
dt[, `SA1_CODE` := as.character(`SA1_CODE`)]
# Subset data for 1 random SA2_NAME
set.seed(12345)
sa2_names <- sample(unique(dt$`SA2_NAME`), 1)
dt <- dt[`SA2_NAME` %in% sa2_names]
# dt <- dt[grepl("brisbane city", `SA2_NAME`, ignore.case = T)]
plot_ly(
data = dt,
x = ~Year,
# y = ~m_all,
# y = ~f_all,
y = ~p_all,
color = ~`SA1_CODE`, # One trace per SA1 within the sampled SA2
type = 'scatter',
mode = 'lines', # Line plot
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>SA1:", `SA1_CODE`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Population (p_all) Over Time by SA1",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "SA2_CODE"))
)
# Aggregate by year column and sum all columns disregarding SA2_NAME
dt_agg <- copy(erp_sa1_1995_2023_asgs2021)
# dt_agg <- dt_agg[, `SA2_NAME` := NULL]
dt_agg <- dt_agg[, c("SA1_CODE", "SA2_CODE", "SA2_NAME") := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Plot using Plotly
plot_ly(data = dt_agg, x = ~Year, y = ~Value, color = ~Category,
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Remove temporary objects
rm(dt, dt_agg, sa2_names)
# Looks good
# There are some SA1s within an SA2 whose trends differ in 1995:2010 than in 2011:2023
# Most follow plausible trends
# But when looking at aggregated data, the trends look as they should
# This is as expected - there will be some different trends in different areas within an SA2
Correspondence for converting SA1 ASGS2021 to SAL (suburbs) 2021 is available from:
Australian Bureau of Statistics. (2021). ASGS Geographic Correspondences (2021) Edition 3. ABS. https://data.gov.au/data/dataset/asgs-edition-3-2021-correspondences.
# Load SA1 2021 to SAL 2021 correspondence data
cg_sa1_2021_sal_2021 <- as.data.table(read.csv('ABS data/CG_SA1_2021_SAL_2021.csv'))
# Convert "SA1_CODE_2021" and "SAL_CODE_2021" to numeric
cg_sa1_2021_sal_2021[, `SA1_CODE_2021` := as.numeric(`SA1_CODE_2021`)]
cg_sa1_2021_sal_2021[, `SAL_CODE_2021` := as.numeric(`SAL_CODE_2021`)]
# Subset rows where SA1_CODE_2021 starts with 3 [Queensland SA1s]
# (grepl coerces the numeric codes back to character for the prefix match)
cg_sa1_2021_sal_2021 <- cg_sa1_2021_sal_2021[grepl("^3", `SA1_CODE_2021`)]
# Inspecting the quality of correspondence
plot(table(cg_sa1_2021_sal_2021$INDIV_TO_REGION_QLTY_INDICATOR))
# Ensure column names match
setnames(cg_sa1_2021_sal_2021, "SA1_CODE_2021", "SA1_CODE")
# Merge with allow.cartesian=TRUE to handle one-to-many joins
# NOTE(review): all.x = TRUE means SA1s with no correspondence row keep NA
# SAL codes and will aggregate into an NA suburb group — confirm none exist
merged_data <- merge(erp_sa1_1995_2023_asgs2021,
cg_sa1_2021_sal_2021[, .(`SA1_CODE`, SAL_CODE_2021, SAL_NAME_2021, RATIO_FROM_TO)],
by = "SA1_CODE", all.x = TRUE, allow.cartesian = TRUE)
# Identify population columns
pop_cols <- setdiff(names(erp_sa1_1995_2023_asgs2021), c("Year", "SA1_CODE", "SA2_CODE", "SA2_NAME"))
# Scale population estimates by RATIO_FROM_TO
for (col in pop_cols) {
merged_data[, (col) := get(col) * RATIO_FROM_TO]
}
# Aggregate to suburb level
erp_suburb_1995_2023_asgs2021 <- merged_data[, lapply(.SD, sum, na.rm = TRUE),
by = .(Year, SAL_CODE_2021, SAL_NAME_2021), .SDcols = pop_cols]
# Round population to nearest integer
for (col in pop_cols) {
erp_suburb_1995_2023_asgs2021[, (col) := round(get(col))]
}
# Making sure totals make sense
# all age groups collectively should match m_all, f_all, or p_all
# males and females of each age group should match p_all
# This time, instead of forcing all age groups to match with p_all (top down approach),
# a bottom up approach will be implemented forcing p_all to be the sum of all age groups.
# Because, for SA2s we had the state population as a benchmark, whereas we are using correspondence data this time.
erp_suburb_1995_2023_asgs2021[, m_all := rowSums(.SD), .SDcols = paste0("m_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"))]
erp_suburb_1995_2023_asgs2021[, f_all := rowSums(.SD), .SDcols = paste0("f_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"))]
erp_suburb_1995_2023_asgs2021[, p_00_24 := rowSums(.SD), .SDcols = c("m_00_24", "f_00_24")]
erp_suburb_1995_2023_asgs2021[, p_25_44 := rowSums(.SD), .SDcols = c("m_25_44", "f_25_44")]
erp_suburb_1995_2023_asgs2021[, p_45_64 := rowSums(.SD), .SDcols = c("m_45_64", "f_45_64")]
erp_suburb_1995_2023_asgs2021[, p_65_74 := rowSums(.SD), .SDcols = c("m_65_74", "f_65_74")]
erp_suburb_1995_2023_asgs2021[, p_75_84 := rowSums(.SD), .SDcols = c("m_75_84", "f_75_84")]
erp_suburb_1995_2023_asgs2021[, p_85_99 := rowSums(.SD), .SDcols = c("m_85_99", "f_85_99")]
erp_suburb_1995_2023_asgs2021[, p_all := rowSums(.SD), .SDcols = c("m_all", "f_all")]
# Rename columns for clarity
setnames(erp_suburb_1995_2023_asgs2021, c("SAL_CODE_2021", "SAL_NAME_2021"), c("SUBURB_CODE", "SUBURB_NAME"))
# Save suburb estimates
write.csv(erp_suburb_1995_2023_asgs2021, "Population estimates data/For cardiovascular diseases/erp_suburb_1995_2023_asgs2021.csv", row.names = FALSE)
# Remove temporary objects
rm(cg_sa1_2021_sal_2021, merged_data, pop_cols, col)
# Visual sanity checks on the SAL (2021 suburb) series
dt <- copy(erp_suburb_1995_2023_asgs2021)
# Subset data for 100 random suburb names
set.seed(12345)
suburb_names <- sample(unique(dt$`SUBURB_NAME`), 100)
dt <- dt[`SUBURB_NAME` %in% suburb_names]
# dt <- dt[grepl("brisbane", `SUBURB_NAME`, ignore.case = T)]
# NOTE(review): y plots m_all while the title/hover text mention p_all —
# intentional toggling via the commented lines, but easy to misread
plot_ly(
data = dt,
# data = dt_agg,
x = ~Year,
y = ~m_all,
# y = ~f_all,
# y = ~p_all,
color = ~`SUBURB_NAME`, # Group by SUBURB_NAME
type = 'scatter',
mode = 'lines', # Line plot
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>SAL:", `SUBURB_NAME`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Population (p_all) Over Time by SAL",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "SUBURB_NAME"))
)
# Aggregate by year column and sum all columns disregarding SUBURB_NAME and SUBURB_CODE
dt_agg <- copy(erp_suburb_1995_2023_asgs2021)
dt_agg <- dt_agg[, SUBURB_NAME := NULL]
dt_agg <- dt_agg[, SUBURB_CODE := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Plot using Plotly
plot_ly(data = dt_agg, x = ~Year, y = ~Value, color = ~Category,
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Remove temporary objects
rm(suburb_names, dt, dt_agg)
Correspondence for converting SAL (suburbs) 2021 to SSC (suburbs) 2016 is available from:
Australian Bureau of Statistics. (2021). ASGS Geographic Correspondences (2021) Edition 3. ABS. https://data.gov.au/data/dataset/asgs-edition-3-2021-correspondences.
# Load correspondence file
cg_ssc_2016_sal_2021 <- as.data.table(read.csv('ABS data/CG_SSC_2016_SAL_2021.csv'))
# Convert "SSC_CODE_2016" and "SAL_CODE_2021" to numeric
cg_ssc_2016_sal_2021[, SSC_CODE_2016 := as.numeric(SSC_CODE_2016)]
cg_ssc_2016_sal_2021[, SAL_CODE_2021 := as.numeric(SAL_CODE_2021)]
# Subset rows where SAL_CODE_2021 starts with 3 [Queensland suburbs]
cg_ssc_2016_sal_2021 <- cg_ssc_2016_sal_2021[grepl("^3", SAL_CODE_2021)]
# Inspecting the quality of correspondence
plot(table(cg_ssc_2016_sal_2021$INDIV_TO_REGION_QLTY_INDICATOR))
# Ensure column names match for merging
setnames(cg_ssc_2016_sal_2021, "SAL_CODE_2021", "SUBURB_CODE")
# Merge suburb population data with the correspondence table
merged_data <- merge(erp_suburb_1995_2023_asgs2021,
cg_ssc_2016_sal_2021[, .(`SUBURB_CODE`, SSC_CODE_2016, SSC_NAME_2016, RATIO_FROM_TO)],
by = "SUBURB_CODE", all.x = TRUE, allow.cartesian = TRUE)
# Identify population columns
pop_cols <- setdiff(names(erp_suburb_1995_2023_asgs2021), c("Year", "SUBURB_CODE", "SUBURB_NAME"))
# Scale population estimates using RATIO_FROM_TO
for (col in pop_cols) {
merged_data[, (col) := get(col) * RATIO_FROM_TO]
}
# Aggregate data at the SSC (2016 suburb) level
erp_suburb_1995_2023_asgs2016 <- merged_data[, lapply(.SD, sum, na.rm = TRUE),
by = .(Year, SSC_CODE_2016, SSC_NAME_2016), .SDcols = pop_cols]
# Round population to nearest integer
for (col in pop_cols) {
erp_suburb_1995_2023_asgs2016[, (col) := round(get(col))]
}
# Making sure totals make sense
# all age groups collectively should match m_all, f_all, or p_all
# males and females of each age group should match p_all
# This time, instead of forcing all age groups to match with p_all (top down approach),
# a bottom up approach will be implemented forcing p_all to be the sum of all age groups.
# Because, for SA2s we had the state population as a benchmark, whereas we are using correspondence data this time.
erp_suburb_1995_2023_asgs2016[, m_all := rowSums(.SD), .SDcols = paste0("m_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"))]
erp_suburb_1995_2023_asgs2016[, f_all := rowSums(.SD), .SDcols = paste0("f_", c("00_24", "25_44", "45_64", "65_74", "75_84", "85_99"))]
erp_suburb_1995_2023_asgs2016[, p_00_24 := rowSums(.SD), .SDcols = c("m_00_24", "f_00_24")]
erp_suburb_1995_2023_asgs2016[, p_25_44 := rowSums(.SD), .SDcols = c("m_25_44", "f_25_44")]
erp_suburb_1995_2023_asgs2016[, p_45_64 := rowSums(.SD), .SDcols = c("m_45_64", "f_45_64")]
erp_suburb_1995_2023_asgs2016[, p_65_74 := rowSums(.SD), .SDcols = c("m_65_74", "f_65_74")]
erp_suburb_1995_2023_asgs2016[, p_75_84 := rowSums(.SD), .SDcols = c("m_75_84", "f_75_84")]
erp_suburb_1995_2023_asgs2016[, p_85_99 := rowSums(.SD), .SDcols = c("m_85_99", "f_85_99")]
erp_suburb_1995_2023_asgs2016[, p_all := rowSums(.SD), .SDcols = c("m_all", "f_all")]
# Rename columns for clarity
setnames(erp_suburb_1995_2023_asgs2016, c("SSC_CODE_2016", "SSC_NAME_2016"), c("SUBURB_CODE", "SUBURB_NAME"))
# Save suburb estimates
write.csv(erp_suburb_1995_2023_asgs2016, "Population estimates data/For cardiovascular diseases/erp_suburb_1995_2023_asgs2016.csv", row.names = FALSE)
# Remove temporary objects
rm(cg_ssc_2016_sal_2021, merged_data, col, pop_cols)
# Visual sanity checks on the SSC (2016 suburb) series
dt <- copy(erp_suburb_1995_2023_asgs2016)
# Subset data for 100 random suburb names
set.seed(12345)
suburb_names <- sample(unique(dt$`SUBURB_NAME`), 100)
dt <- dt[`SUBURB_NAME` %in% suburb_names]
# dt <- dt[grepl("brisbane", `SUBURB_NAME`, ignore.case = T)]
# NOTE(review): y plots m_all while the title/hover text mention p_all —
# intentional toggling via the commented lines, but easy to misread
plot_ly(
data = dt,
# data = dt_agg,
x = ~Year,
y = ~m_all,
# y = ~f_all,
# y = ~p_all,
color = ~`SUBURB_NAME`, # Group by SUBURB_NAME
type = 'scatter',
mode = 'lines', # Line plot
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>SSC:", `SUBURB_NAME`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Population (p_all) Over Time by SSC",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "SUBURB_NAME"))
)
# Aggregate by year column and sum all columns disregarding SUBURB_NAME and SUBURB_CODE
dt_agg <- copy(erp_suburb_1995_2023_asgs2016)
dt_agg <- dt_agg[, SUBURB_NAME := NULL]
dt_agg <- dt_agg[, SUBURB_CODE := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Plot using Plotly
plot_ly(data = dt_agg, x = ~Year, y = ~Value, color = ~Category,
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Remove temporary objects
rm(suburb_names, dt, dt_agg)
# Persist the whole workspace for downstream analyses
save.image("Population estimates data/For cardiovascular diseases/Population-estimates.RData")
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY-SA 4.0. Source code is available at https://github.com/ctrl-shift-vs/Population-estimates, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Singh, et al. (2025, April 9). Population estimates. Retrieved from https://github.com/ctrl-shift-vs/Population-estimates
BibTeX citation
@misc{singh2025population,
author = {Singh, Vishal and Cramb, Susanna and Cortes-Ramirez, Javier},
title = {Population estimates},
url = {https://github.com/ctrl-shift-vs/Population-estimates},
year = {2025}
}